Oral cancer, a major subtype of head and neck cancers, remains one of the most aggressive malignancies globally. Oral squamous cell carcinoma (OSCC) often arises from premalignant lesions such as leukoplakia and progresses through intermediate pathological stages, including hyperkeratosis with no recurrence (HkNR) and dysplasia, before evolving into invasive cancer. Despite therapeutic advancements, early detection remains a challenge, contributing to a persistently low 5-year survival rate.
In this study, we analyze transcriptomic data from the GEO dataset GSE227919, which includes samples from healthy controls, dysplasia, HkNR, and cancer tissues. By performing differential expression analysis across these sequential stages—HkNR vs Control, Dysplasia vs HkNR, and Cancer vs Dysplasia—we aim to capture key transcriptional changes along the disease trajectory. Enrichment analyses, including g:Profiler and GSEA (Gene Set Enrichment Analysis), are employed to identify dysregulated pathways. This approach enables us to explore the molecular landscape of oral cancer progression and potentially highlight targets for early intervention and therapeutic development.
library(tidyverse)
library(limma)
library(edgeR)
library(pheatmap)
library(plotly)
library(VennDiagram)
library(gprofiler2)
library(clusterProfiler)
library(msigdbr)
library(ggvenn)
library(DT)
library(knitr)
library(purrr)
expression_data <- read.table("project_data.txt", header = TRUE, row.names = 1) %>% as.matrix()
metadata <- read.csv("project_metadata.csv")
common_samples <- intersect(colnames(expression_data), metadata$sample)
expr_mat <- expression_data[, common_samples]
meta4 <- metadata %>% filter(sample %in% common_samples) %>% arrange(match(sample, common_samples))
rownames(meta4) <- meta4$sample
meta4$Stage <- factor(meta4$group, levels = c("Control", "HkNR", "Dysplasia", "Cancer"), ordered = TRUE)
log_expr <- log2(expr_mat + 1)
pca <- prcomp(t(log_expr), scale. = TRUE)
pca_df <- as.data.frame(pca$x)
pca_df$Stage <- meta4$Stage
ggplot(pca_df, aes(x = PC1, y = PC2, color = Stage)) +
geom_point(size = 3) +
theme_minimal() +
labs(title = "PCA of Samples")
design4 <- model.matrix(~0 + Stage, data = meta4)
colnames(design4) <- levels(meta4$Stage)
dge <- DGEList(counts = expr_mat)
dge <- calcNormFactors(dge)
v <- voom(dge, design4, plot = FALSE)
fit <- lmFit(v, design4)
cont4 <- makeContrasts(
HkNR_vs_Control = HkNR - Control,
Dys_vs_HkNR = Dysplasia - HkNR,
Canc_vs_Dys = Cancer - Dysplasia,
Canc_vs_Control = Cancer - Control,
levels = design4
)
fit2 <- contrasts.fit(fit, cont4) %>% eBayes()
de_HkNR_vs_Control <- topTable(fit2, coef = "HkNR_vs_Control", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")
de_Dys_vs_HkNR <- topTable(fit2, coef = "Dys_vs_HkNR", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")
de_Canc_vs_Dys <- topTable(fit2, coef = "Canc_vs_Dys", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")
de_Canc_vs_Control <- topTable(fit2, coef = "Canc_vs_Control", adjust.method = "BH", number = Inf) %>% rownames_to_column("geneID")
get_top_deg_table <- function(de_df, contrast_label) {
de_df <- de_df %>%
mutate(Direction = case_when(
adj.P.Val < 0.05 & logFC > 1 ~ "Upregulated",
adj.P.Val < 0.05 & logFC < -1 ~ "Downregulated",
TRUE ~ "Not Significant"
)) %>%
filter(Direction %in% c("Upregulated", "Downregulated")) %>%
arrange(desc(abs(logFC))) %>%
slice(1:20)
cat(paste0("### Top 20 DEGs: ", contrast_label, "\n\n"))
datatable(de_df, options = list(pageLength = 10), rownames = FALSE)
}
get_top_deg_table(de_HkNR_vs_Control, "HkNR vs Control")
get_top_deg_table(de_Dys_vs_HkNR, "Dysplasia vs HkNR")
get_top_deg_table(de_Canc_vs_Dys, "Cancer vs Dysplasia")
get_top_deg_table(de_Canc_vs_Control, "Cancer vs Control")
plot_volcano <- function(df, title) {
df <- df %>% mutate(logP = -log10(adj.P.Val + 1e-300),
Direction = case_when(
adj.P.Val < 0.05 & logFC > 1 ~ "Up",
adj.P.Val < 0.05 & logFC < -1 ~ "Down",
TRUE ~ "NS"))
plot_ly(df, x = ~logFC, y = ~logP, color = ~Direction, colors = c("blue", "grey", "red"),
text = ~paste("Gene:", geneID, "<br>logFC:", round(logFC,2), "<br>FDR:", signif(adj.P.Val, 3)),
hoverinfo = "text", type = 'scatter', mode = 'markers') %>%
layout(title = title, xaxis = list(title = "log2FC"), yaxis = list(title = "-log10(FDR)"))
}
plot_volcano(de_HkNR_vs_Control, "Volcano: HkNR vs Control")
plot_volcano(de_Dys_vs_HkNR, "Volcano: Dysplasia vs HkNR")
plot_volcano(de_Canc_vs_Dys, "Volcano: Cancer vs Dysplasia")
plot_volcano(de_Canc_vs_Control, "Volcano: Cancer vs Control")
genes1 <- de_HkNR_vs_Control %>% filter(adj.P.Val < 0.05) %>% pull(geneID)
genes2 <- de_Dys_vs_HkNR %>% filter(adj.P.Val < 0.05) %>% pull(geneID)
genes3 <- de_Canc_vs_Dys %>% filter(adj.P.Val < 0.05) %>% pull(geneID)
venn_list <- list(
"HkNR vs Control" = genes1,
"Dys vs HkNR" = genes2,
"Cancer vs Dys" = genes3
)
ggvenn(venn_list, fill_color = c("red", "green", "blue"), text_size = 4, set_name_size = 5)
top_genes <- unique(c(
de_HkNR_vs_Control %>% arrange(adj.P.Val) %>% slice(1:5) %>% pull(geneID),
de_Dys_vs_HkNR %>% arrange(adj.P.Val) %>% slice(1:5) %>% pull(geneID),
de_Canc_vs_Dys %>% arrange(adj.P.Val) %>% slice(1:5) %>% pull(geneID)
))
heat_mat <- log_expr[top_genes, ] # expression of the pre‑selected genes
ord_samps <- meta4 %>% arrange(Stage) %>% pull(sample)
ann <- data.frame(Stage = meta4$Stage)
rownames(ann) <- meta4$sample
pheatmap(
heat_mat[, ord_samps],
annotation_col = ann[ord_samps, , drop = FALSE],
cluster_rows = TRUE,
cluster_cols = FALSE,
show_rownames = TRUE,
fontsize_row = 6,
main = "Top DEGs Across All Stages"
)
sig_genes <- genes3
gostres <- gost(query = sig_genes, organism = "hsapiens", correction_method = "fdr")
gostplot(gostres, interactive = TRUE, capped = TRUE)
expr_long <- log_expr[top_genes, ] %>% as.data.frame() %>%
rownames_to_column("Gene") %>%
pivot_longer(-Gene, names_to = "sample", values_to = "logExpr") %>%
left_join(meta4, by = "sample")
avg_expr <- expr_long %>% group_by(Gene, Stage) %>% summarise(meanExpr = mean(logExpr), .groups = "drop")
ggplot(avg_expr, aes(x = Stage, y = meanExpr, group = 1)) +
geom_line(color = "steelblue", size = 1) +
geom_point(size = 2) +
facet_wrap(~ Gene, scales = "free_y") +
theme_minimal(base_size = 13) +
labs(title = "Gene-wise Expression Trajectories", y = "Mean log2(Expression + 1)", x = "Stage")
library(BiocParallel)
BPPARAM <- SerialParam() # Single line fix for parallel issues
hs_c2 <- msigdbr(species = "Homo sapiens", category = "C2") %>%
select(gs_name, gene_symbol)
rank_and_run_gsea <- function(de_df, label) {
ranked <- deframe(de_df[, c("geneID", "logFC")]) %>% sort(decreasing = TRUE)
GSEA(ranked, TERM2GENE = hs_c2, verbose = FALSE, BPPARAM = BPPARAM)@result %>%
transmute(ID, Description, NES, p.adjust, Stage = label)
}
df_gsea <- bind_rows(
rank_and_run_gsea(de_HkNR_vs_Control, "HkNR vs Control"),
rank_and_run_gsea(de_Dys_vs_HkNR, "Dysplasia vs HkNR"),
rank_and_run_gsea(de_Canc_vs_Dys, "Cancer vs Dysplasia")
)
top_terms <- df_gsea %>%
filter(p.adjust < 0.05) %>%
group_by(ID) %>%
summarise(mean_NES = mean(abs(NES)), .groups = "drop") %>%
slice_max(mean_NES, n = 10) %>%
pull(ID)
ggplot(df_gsea %>% filter(ID %in% top_terms),
aes(x = Stage, y = NES, group = ID, color = ID)) +
geom_line(size = 1) + geom_point(size = 2) +
theme_minimal(base_size = 13) +
labs(title = "GSEA: NES Trajectories Across Stages", x = "Comparison", y = "NES") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))